import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from pandasql import sqldf
import plotly.express as px
import pandas as pd
# Load the raw bank-transactions extract into a DataFrame and preview it.
csv_path = "Documents//Python//bank_transactions.csv"
df = pd.read_csv(csv_path)
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount (INR) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
# Drop the unit suffix from the amount column name for easier referencing.
df = df.rename(columns={'TransactionAmount (INR)': 'TransactionAmount'})
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
df.describe()  # summary statistics for the three numeric columns (balance, time, amount)
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| count | 1.046198e+06 | 1.048567e+06 | 1.048567e+06 |
| mean | 1.154035e+05 | 1.570875e+05 | 1.574335e+03 |
| std | 8.464854e+05 | 5.126185e+04 | 6.574743e+03 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 4.721760e+03 | 1.240300e+05 | 1.610000e+02 |
| 50% | 1.679218e+04 | 1.642260e+05 | 4.590300e+02 |
| 75% | 5.765736e+04 | 2.000100e+05 | 1.200000e+03 |
| max | 1.150355e+08 | 2.359590e+05 | 1.560035e+06 |
df.info()  # dtypes and non-null counts: ~1.05M rows, 9 columns, several with missing values
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1048567 entries, 0 to 1048566 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TransactionID 1048567 non-null object 1 CustomerID 1048567 non-null object 2 CustomerDOB 1045170 non-null object 3 CustGender 1047467 non-null object 4 CustLocation 1048416 non-null object 5 CustAccountBalance 1046198 non-null float64 6 TransactionDate 1048567 non-null object 7 TransactionTime 1048567 non-null int64 8 TransactionAmount 1048567 non-null float64 dtypes: float64(2), int64(1), object(6) memory usage: 72.0+ MB
df.isnull().sum()  # absolute count of missing values per column
TransactionID 0 CustomerID 0 CustomerDOB 3397 CustGender 1100 CustLocation 151 CustAccountBalance 2369 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
# Missing values per column as a percentage of all rows (mean of a boolean
# mask is exactly sum/len).
df.isnull().mean() * 100
TransactionID 0.000000 CustomerID 0.000000 CustomerDOB 0.323966 CustGender 0.104905 CustLocation 0.014401 CustAccountBalance 0.225927 TransactionDate 0.000000 TransactionTime 0.000000 TransactionAmount 0.000000 dtype: float64
# Preview rows with a missing gender — note DOB is often the 1/1/1800 placeholder here too.
df.loc[df['CustGender'].isna()].head(10)
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 176 | T177 | C5698953 | 1/1/1800 | NaN | NAVI MUMBAI | 8512.28 | 12/8/16 | 155727 | 3449.00 |
| 2124 | T2125 | C4525944 | 1/1/1800 | NaN | BANGALORE | 7115.05 | 21/10/16 | 201052 | 584.06 |
| 5061 | T5062 | C7541851 | 1/1/1800 | NaN | MUMBAI | 10669.85 | 22/9/16 | 61227 | 1500.00 |
| 13035 | T13036 | C3788162 | 1/1/1800 | NaN | MULUND WEST | 65989.63 | 26/9/16 | 210819 | 336.52 |
| 14878 | T14879 | C6239153 | 1/1/1800 | NaN | NAVI MUMBAI | 603.00 | 26/9/16 | 163122 | 1494.00 |
| 15314 | T15315 | C5793145 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 92928 | 1.00 |
| 15768 | T15769 | C8891925 | 1/1/1800 | NaN | NAVI MUMBAI | 31691.58 | 26/9/16 | 205542 | 4448.64 |
| 16788 | T16789 | C3893120 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 93137 | 1.00 |
| 17300 | T17301 | C7093134 | 12/12/50 | NaN | MUMBAI | 597.00 | 26/9/16 | 93033 | 1.00 |
| 17942 | T17943 | C5239087 | 1/1/1800 | NaN | MUMBAI | 97178.78 | 26/9/16 | 155706 | 3310.00 |
# Preview a few rows with a missing account balance.
df.loc[df['CustAccountBalance'].isna()].head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 29 | T30 | C8736958 | 19/12/86 | M | GRAM PANCHAYTH HYDERABAD | NaN | 3/8/16 | 160636 | 2600.0 |
| 1271 | T1272 | C2314972 | 29/7/93 | M | THANE | NaN | 21/10/16 | 203227 | 53.0 |
| 2513 | T2514 | C3742855 | 18/3/87 | F | MUMBAI | NaN | 21/10/16 | 150602 | 300.0 |
| 2831 | T2832 | C3221342 | 27/2/02 | M | NEW DELHI | NaN | 21/10/16 | 54746 | 1.0 |
| 3550 | T3551 | C5921379 | 27/2/02 | M | NEW DELHI | NaN | 21/10/16 | 54704 | 1.0 |
# Drop rows with any missing value: ~6.9k of ~1.05M rows (<0.7%), so the loss
# does not significantly impact the analysis.  .copy() makes newdf an
# independent frame, which prevents the SettingWithCopyWarning that later
# in-place edits (e.g. drop/loc assignments) would otherwise raise.
newdf = df.dropna(axis=0).copy()
newdf.isnull().sum()
TransactionID 0 CustomerID 0 CustomerDOB 0 CustGender 0 CustLocation 0 CustAccountBalance 0 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
print(newdf.shape,df.shape)  # rows after vs. before dropping NA rows
(1041614, 9) (1048567, 9)
# How many rows dropna removed (negative = rows lost).
print(newdf.shape[0] - df.shape[0])
-6953
# TransactionID and CustomerID are unique identifiers with no analytic value.
# Assigning the result (rather than inplace=True on a frame derived from a
# slice) avoids the SettingWithCopyWarning the original call emitted.
newdf = newdf.drop(columns=['TransactionID', 'CustomerID'])
/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
newdf.head()  # identifier columns are gone
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
newdf.corr() # Pearson correlation of the numeric columns only; no strong linear relationships
# NOTE(review): pandas >= 2.0 raises on object columns here — pass numeric_only=True when upgrading
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| CustAccountBalance | 1.000000 | -0.004012 | 0.062692 |
| TransactionTime | -0.004012 | 1.000000 | 0.008296 |
| TransactionAmount | 0.062692 | 0.008296 | 1.000000 |
newdf.cov()  # covariance of the numeric columns (same pandas >= 2.0 numeric_only caveat as corr)
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| CustAccountBalance | 7.170041e+11 | -1.741558e+08 | 3.483155e+08 |
| TransactionTime | -1.741558e+08 | 2.627948e+09 | 2.790504e+06 |
| TransactionAmount | 3.483155e+08 | 2.790504e+06 | 4.305281e+07 |
newdf.CustomerDOB.value_counts() # 56,292 rows share the DOB '1/1/1800' — clearly a data-entry placeholder
1/1/1800 56292
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
...
2/12/51 1
20/3/52 1
26/9/47 1
4/10/41 1
24/10/44 1
Name: CustomerDOB, Length: 17233, dtype: int64
# Remove the rows carrying the 1/1/1800 placeholder DOB — almost certainly an
# entry error, and 56k bad birth dates would distort any age analysis.
newdf = newdf[newdf['CustomerDOB'] != '1/1/1800']
newdf.CustomerDOB.value_counts()
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
1/1/92 631
...
23/2/05 1
28/11/42 1
23/9/49 1
14/3/40 1
24/10/44 1
Name: CustomerDOB, Length: 17232, dtype: int64
# Parse the day-first date strings into datetime64 columns.
newdf['TransactionDate'] = pd.to_datetime(newdf['TransactionDate'], format = '%d/%m/%y')
newdf['CustomerDOB'] = pd.to_datetime(newdf['CustomerDOB'], format = '%d/%m/%y')
# %y pivots two-digit years at 69: '57' parses as 2057, not 1957 — hence the
# impossible future birth years below, corrected in the next cell.
newdf.CustomerDOB # Year 2057 ??
0 1994-01-10
1 2057-04-04
2 1996-11-26
3 1973-09-14
4 1988-03-24
...
1048562 1990-04-08
1048563 1992-02-20
1048564 1989-05-18
1048565 1978-08-30
1048566 1984-03-05
Name: CustomerDOB, Length: 985322, dtype: datetime64[ns]
# Re-pivot the wrongly parsed two-digit years: a birth year >= 2021 is
# impossible here, so shift those dates back exactly one century.
newdf.loc[newdf['CustomerDOB'].dt.year >= 2021, ['CustomerDOB']] -= pd.DateOffset(years = 100)
newdf.head() # Subtracted 100 years from the records that were wrongly converted!
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 |
# Age in whole years as of "today".  The original divided the timedelta by
# np.timedelta64(1, 'Y'), which modern pandas rejects ("Y"/"M" units are no
# longer supported for timedelta division); dividing the day count by the
# same mean Gregorian year length (365.2425 days) is equivalent.
# NOTE(review): age at TransactionDate (2016) may be more meaningful than age
# today — confirm intent.
newdf['Age'] = round((pd.to_datetime('today') - newdf['CustomerDOB']).dt.days / 365.2425)
newdf.shape
(985322, 8)
newdf.head()  # new Age column appended
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 | 28.0 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 | 65.0 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 | 25.0 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 | 48.0 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 | 34.0 |
newdf['TransactionAmount'].describe()  # amount distribution; median ~431 INR motivates the >500 cutoff used below
count 9.853220e+05 mean 1.452425e+03 std 6.139765e+03 min 0.000000e+00 25% 1.510000e+02 50% 4.310000e+02 75% 1.125000e+03 max 1.560035e+06 Name: TransactionAmount, dtype: float64
# One row per location for transactions above 500 INR, sorted by amount.
# NOTE(review): in SQLite, GROUP BY with bare (non-aggregated) columns picks an
# arbitrary row per location — the Age/balance/gender shown for each location
# are from one unspecified matching transaction.  Confirm this sampling is the
# intended behaviour (e.g. vs. MAX(TransactionAmount) per location).
Loc_TA = sqldf('select Age,CustAccountBalance,CustGender,CustLocation,TransactionAmount from newdf where TransactionAmount > 500 group by CustLocation order by TransactionAmount desc')
Loc_TA
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 42.0 | 798753.95 | M | GATE NO 4 MUMBAI | 455122.00 |
| 1 | 35.0 | 90123.52 | M | JAIPUR DURGAPURA | 378415.46 |
| 2 | 45.0 | 215315.01 | M | ROOMFORD | 117621.00 |
| 3 | 44.0 | 43113.77 | M | MILTON KEYNES | 104478.00 |
| 4 | 38.0 | 73542.33 | M | PHASE-01 HINJEWADI PUNE | 100000.00 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 38.0 | 8387.04 | M | NAGAR CHENNAI | 501.27 |
| 6663 | 31.0 | 5075.39 | M | THALAYOLAPARAMBU | 501.00 |
| 6664 | 30.0 | 76438.86 | F | PETHAPUR GANDHI NAGAR | 501.00 |
| 6665 | 70.0 | 90260.19 | M | DAHISAR E MUMBAI | 501.00 |
| 6666 | 32.0 | 494674.28 | F | BORIVALI E | 501.00 |
6667 rows × 5 columns
# Median transaction amount across the per-location sample.
Loc_TA['TransactionAmount'].median()
1298.0
Loc_TA.shape  # 6667 locations x 5 features
(6667, 5)
import plotly.express as px  # already imported above; harmless re-import

# Age against transaction amount, coloured by gender, for the full sample.
fig = px.scatter(Loc_TA, x='Age', y='TransactionAmount', color='CustGender')
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Re-plot with extreme amounts (>= 50000) excluded to reveal the bulk of the data.
Loc_TA_filter = Loc_TA[Loc_TA['TransactionAmount'] < 50000]

import plotly.express as px  # already imported above; harmless re-import
fig = px.scatter(Loc_TA_filter, x='Age', y='TransactionAmount', color='CustGender')
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Integer-encode the two categorical columns so they can be scaled and clustered.
labelencoder = LabelEncoder()
for col in ('CustGender', 'CustLocation'):
    Loc_TA[col] = labelencoder.fit_transform(Loc_TA[col])
Loc_TA.head()
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 42.0 | 798753.95 | 1 | 2060 | 455122.00 |
| 1 | 35.0 | 90123.52 | 1 | 2542 | 378415.46 |
| 2 | 45.0 | 215315.01 | 1 | 5140 | 117621.00 |
| 3 | 44.0 | 43113.77 | 1 | 3700 | 104478.00 |
| 4 | 38.0 | 73542.33 | 1 | 4673 | 100000.00 |
# Standardise every feature to zero mean / unit variance; returns an ndarray,
# so the column labels are re-attached in a later cell.
scaler = StandardScaler()
Loc_TA1= scaler.fit_transform(Loc_TA)
Loc_TA1
array([[ 5.27984731e-01, 1.25170860e+00, 5.91727773e-01,
-6.61437139e-01, 4.86905810e+01],
[-2.57470607e-01, -2.06569146e-02, 5.91727773e-01,
-4.10995112e-01, 4.04302057e+01],
[ 8.64608447e-01, 2.04127864e-01, 5.91727773e-01,
9.38897808e-01, 1.23457670e+01],
...,
[-8.18510134e-01, -4.52280996e-02, -1.68996631e+00,
6.84299067e-01, -2.66653699e-01],
[ 3.66980608e+00, -2.04115198e-02, 5.91727773e-01,
-9.90337146e-01, -2.66653699e-01],
[-5.94094323e-01, 7.05725152e-01, -1.68996631e+00,
-1.25324932e+00, -2.66653699e-01]])
# Wrap the scaled ndarray back into a DataFrame, restoring the original column names.
Loc_TA1 = pd.DataFrame(Loc_TA1, columns=Loc_TA.columns)
Loc_TA1
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.527985 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.257471 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.864608 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.752401 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.079153 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 0.079153 | -0.167417 | 0.591728 | 0.316949 | -0.266625 |
| 6663 | -0.706302 | -0.173363 | 0.591728 | 1.367039 | -0.266654 |
| 6664 | -0.818510 | -0.045228 | -1.689966 | 0.684299 | -0.266654 |
| 6665 | 3.669806 | -0.020412 | 0.591728 | -0.990337 | -0.266654 |
| 6666 | -0.594094 | 0.705725 | -1.689966 | -1.253249 | -0.266654 |
6667 rows × 5 columns
# ELBOW METHOD: fit k-means for k = 1..10 and record the within-cluster sum of
# squares (inertia); the bend ("elbow") in the curve suggests a good k.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10,
           random_state=0).fit(Loc_TA1).inertia_
    for k in range(1, 11)
]

# Plot WCSS against k to locate the elbow.
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within-cluster sum of squares
plt.show()
from sklearn.cluster import KMeans

# Final model with k = 6, chosen from the elbow plot above.  random_state and
# an explicit n_init make the labels reproducible and match the parameters
# used in the elbow search (the original call omitted both, giving
# run-to-run-varying clusters; sklearn >= 1.4 also changes the n_init default).
kmeans = KMeans(n_clusters=6, init='k-means++', max_iter=300, n_init=10,
                random_state=0)
kmeans.fit(Loc_TA1)
y_kmeans = kmeans.predict(Loc_TA1)
y_kmeans
# Attach the cluster label to each row; namex duplicates the (scaled, encoded)
# CustGender column — presumably for later plotting; verify it is needed.
df3 = Loc_TA1.assign(clusterNo = y_kmeans, namex = Loc_TA1['CustGender'])
print(df3.head())
Age CustAccountBalance CustGender CustLocation TransactionAmount \ 0 0.527985 1.251709 0.591728 -0.661437 48.690581 1 -0.257471 -0.020657 0.591728 -0.410995 40.430206 2 0.864608 0.204128 0.591728 0.938898 12.345767 3 0.752401 -0.105064 0.591728 0.190689 10.930423 4 0.079153 -0.050429 0.591728 0.696250 10.448196 clusterNo namex 0 2 0.591728 1 2 0.591728 2 5 0.591728 3 5 0.591728 4 5 0.591728
Loc_TA1.head()  # unchanged by the assign above (assign returns a new frame)
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.527985 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.257471 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.864608 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.752401 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.079153 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
# ALTERNATIVE TECHNIQUE: hierarchical clustering visualised as a dendrogram.
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

# Average-linkage agglomerative clustering on the scaled features.
linked = linkage(Loc_TA1, 'average')

plt.figure(figsize=(30, 30))
zx = dendrogram(linked, orientation='top', labels=Loc_TA.index,
                color_threshold=60, above_threshold_color='red')
plt.show()